# Computations
import numpy as np
import pandas as pd
# sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score,\
KFold, StratifiedShuffleSplit, ShuffleSplit, learning_curve
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
# Global seaborn theme: white grid background, paper-context font sizes.
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
from matplotlib.font_manager import FontProperties
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
# NOTE(review): the 'seaborn-whitegrid' style name was renamed to
# 'seaborn-v0_8-whitegrid' in matplotlib >= 3.6 — this line assumes an older
# matplotlib; confirm against the pinned environment.
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
# Default figure geometry and label sizes for every matplotlib plot below.
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
# IPython magic: render figures inline in the notebook.
%matplotlib inline
import warnings
# Silence all warnings notebook-wide (e.g. sklearn deprecation noise).
warnings.filterwarnings("ignore")
In this study, we analyze HR data available from kaggle.com. We have already analyzed the dataset and prepared it for modeling (see the details here).
# Load the already-standardized HR attrition dataset (preprocessing done earlier).
df = pd.read_csv('Data/WA_Fn-UseC_-HR-Employee-Attrition_STD.csv')
# Binary classification target: whether the employee churned.
Target = 'Attrition'
# Class labels, in order of first appearance in the data.
Labels = df[Target].unique().tolist()
In the dataset, Attrition represents whether an employee is churned or not. We would like to create a predictive model that predicts this feature.
# Columns excluded from the feature matrix: the target itself plus the
# employee identifier (no predictive value).
# NOTE(review): "Aditional" is a typo for "Additional"; name kept unchanged in
# case later cells reference it.
Aditional_Columns = [Target, 'Employee Number']
X = df.drop(columns = Aditional_Columns)
y = df[Target]
# Single-row heatmap: each feature's correlation with the target, sorted ascending.
fig, ax = plt.subplots(figsize=(17,20))
Temp = pd.concat([X, df[Target]], axis = 1)
Temp = Temp.corr().round(2)
# Keep only the target's row, drop the self-correlation, and order columns by value.
Temp = Temp.loc[(Temp.index == Target)].drop(columns = Target).T.sort_values(by = Target).T
# NOTE(review): vmin=0 clips any negative correlation to the bottom of the
# colormap — confirm no feature correlates negatively with the target.
_ = sns.heatmap(Temp, ax=ax, annot=True, square=True, cmap =sns.color_palette("Greens", n_colors=10),
linewidths = 0.8, vmin=0, vmax=1,
annot_kws={"size": 12},
cbar_kws={'label': Target + ' Correlation', "aspect":40, "shrink": .4, "orientation": "horizontal"})
_ = ax.set_yticklabels('')
del Temp
# Donut chart of the overall class balance, with the minority slice pulled out.
Colors = ['SeaGreen', 'FireBrick']
fig = go.Figure(data=[go.Pie(labels=Labels, values=y.value_counts().values, pull=[0, 0.1],
textfont=dict(size=16),
marker=dict(colors = Colors, line=dict(color='black', width=1)))])
# Hollow center turns the pie into a donut; the annotation labels the hole.
fig.update_traces(hole=.5)
fig.update_layout(height = 400, legend=dict(orientation="v"),
legend_title_text= Target,
annotations=[dict(text= '<b>' + Target + '<b>', x=0.5, y=0.5, font_size=14, showarrow=False)],
# title={'text': '<b>' + Target + '<b>', 'x':0.48, 'y': .83, 'xanchor': 'center', 'yanchor': 'top'}
)
fig.show()
StratifiedShuffleSplit is a variation of ShuffleSplit which returns stratified randomized splits: each set contains approximately the same percentage of samples of each target class as the complete set.
# Stratified train/test split: preserve the class ratio of `y` in both sets.
Test_Size = 0.3
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
# split() yields POSITIONAL indices, so index with .iloc rather than .loc —
# the two coincide only because read_csv produced a default RangeIndex, and
# .loc would silently select wrong rows under any other index.
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
del sss
# Side-by-side donut charts: class balance in the train vs. test set — a visual
# check that the stratified split preserved the Attrition ratio in both sets.
Colors = ['SeaGreen', 'FireBrick']
nc = 2
# One row of two pie ('domain') subplots.
fig = make_subplots(rows=1, cols=nc, specs=[[{'type':'domain'}]*nc])
fig.add_trace(go.Pie(labels=Labels,
values=y_train.value_counts().values,
pull=[0, 0.1],
name= 'Train Set',
textfont=dict(size=16),
marker= dict(colors = Colors, line=dict(color='black', width=1))), 1, 1)
fig.add_trace(go.Pie(labels=Labels,
values=y_test.value_counts().values,
pull=[0, 0.1],
name= 'Test Set',
textfont=dict(size=16),
marker= dict(colors = Colors, line=dict(color='black', width=1))), 1, 2)
# Hollow centers; the annotations label each donut's hole.
fig.update_traces(hole=.5)
fig.update_layout(height = 400, legend=dict(orientation="v"),
legend_title_text= Target,
annotations=[dict(text= '<b>' + 'Train<br>Set' + '<b>', x=0.195, y=0.5, font_size=14, showarrow=False),
dict(text= '<b>' + 'Test<br>Set' + '<b>', x=0.8, y=0.5, font_size=14, showarrow=False)],
title={'text': '<b>' + Target + '<b>', 'x':0.48, 'y': .83, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
In this article, we implement scikit-learn's GaussianNB function which implements the Gaussian Naive Bayes algorithm for classification. The likelihood of the features is assumed to be \begin{align} P(x_i \mid y) = \frac{1}{\sqrt{2\pi\sigma^2_y}} \exp\left(-\frac{(x_i - \mu_y)^2}{2\sigma^2_y}\right) \end{align} The parameters $\sigma_y$ and $\mu_y$ are estimated using maximum likelihood.
def Header(Text, L = 100, C1 = Back.BLUE, C2 = Fore.BLUE):
    """Print *Text* as a colored banner, padded with '=' up to width *L*."""
    banner = f"{C1}{Fore.WHITE}{Style.NORMAL}{Text}{Style.RESET_ALL}"
    rule = f"{C2}{Style.NORMAL}{'=' * (L - len(Text) - 1)}{Style.RESET_ALL}"
    print(banner + ' ' + rule)
def Line(L=100, C = Fore.BLUE):
    """Print a horizontal rule of *L* '=' characters in color *C*."""
    print(f"{C}{Style.NORMAL}{'=' * L}{Style.RESET_ALL}")
def Scoring(model, X, y, n_splits = 20, RS = 42):
    """Cross-validated ROC-AUC and balanced-accuracy scores for `model`.

    Runs shuffled `n_splits`-fold CV (seeded with `RS`) and returns a pair of
    arrays (ROC, bACC) of per-fold scores, with NaN folds removed from both.
    """
    kfold = KFold(n_splits= n_splits, random_state= RS, shuffle = True)
    ROC = cross_val_score(model, X, y, cv=kfold, scoring = 'roc_auc')
    bACC = cross_val_score(model, X, y, cv=kfold, scoring = 'balanced_accuracy')
    ROC = ROC[np.logical_not(np.isnan(ROC))]
    # BUG FIX: the filtered array was previously discarded (missing assignment),
    # so NaN folds still leaked into the balanced-accuracy statistics.
    bACC = bACC[np.logical_not(np.isnan(bACC))]
    return ROC, bACC
def Performance_Table(model, X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test):
    """Two-row table of CV ROC-AUC and balanced accuracy ("mean ± std") for the train and test sets."""
    Cols = ['Set', 'ROC Accuracy', 'Balanced Accuracy']

    def one_row(label, features, target):
        # Score one data set and format each metric as "mean ± std".
        roc, bacc = Scoring(model, X = features, y = target)
        cells = [label,
                 ('%.4f' % roc.mean()) + ' ± ' + ('%.4f' % roc.std()),
                 ('%.4f' % bacc.mean()) + ' ± ' + ('%.4f' % bacc.std())]
        return pd.DataFrame(data = cells, index = Cols).T

    rows = [one_row('Train Set', X_train, y_train),
            one_row('Test Set', X_test, y_test)]
    return pd.concat(rows).reset_index(drop = True)
def Classification_Report_CV(model, X, y, n_splits = 20):
    """Cross-validated classification report and confusion matrix.

    Runs `n_splits`-fold shuffled CV, capturing sklearn's classification_report
    and confusion_matrix on every fold via a side-channel scorer, then returns
    (mean confusion matrix as ints, report DataFrame of "mean ± std" strings).
    """
    Reports = []   # per-fold classification-report cell values
    CM = []        # per-fold confusion matrices
    def classification_report_with_accuracy_score(y_true, y_pred):
        # Scorer side channel: record the full report/CM for this fold,
        # return plain accuracy so cross_val_score gets a scalar.
        Reports.append(pd.DataFrame(metrics.classification_report(y_true, y_pred,
        target_names = Labels, output_dict = True)).T.values)
        CM.append(metrics.confusion_matrix(y_true, y_pred))
        return metrics.accuracy_score(y_true, y_pred)
    cross_val_score(model, X=X, y=y, cv=KFold(n_splits = n_splits, shuffle = True), \
    scoring=metrics.make_scorer(classification_report_with_accuracy_score))
    # Stack the per-fold results into (n_splits, n_cells) matrices.
    Reports_All = Reports[0].ravel()
    CM_All = CM[0].ravel()
    for i in range(1, n_splits):
        Reports_All = np.vstack((Reports_All, Reports[i].ravel()))
        CM_All = np.vstack((CM_All, CM[i].ravel()))
    # Throwaway full-data report, used only for its shape/index/columns.
    # NOTE(review): this uses the module-level X_train/y_train rather than the
    # X/y arguments — harmless for the template shape, but confirm intent.
    R = pd.DataFrame(metrics.classification_report(y_train, model.predict(X_train),
    target_names = Labels, output_dict = True)).T
    Mean = pd.DataFrame(Reports_All.mean(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
    STD = pd.DataFrame(Reports_All.std(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
    Reports = Mean.applymap(lambda x: ('%.4f' % x))+ ' ± ' +STD.applymap(lambda x: ('%.4f' % x))
    # Mean confusion matrix over folds; the RHS (including CM[0]) is evaluated
    # before CM is rebound, so indexing the old list here is safe.
    CM = CM_All.mean(axis = 0).reshape(CM[0].shape).round(0).astype(int)
    # CM = CM_All.sum(axis = 0).reshape(CM[0].shape).round(0).astype(int)
    Reports.index.name = 'CV = % i' % n_splits
    return CM, Reports
def Confusion_Mat(CM_Train, CM_Test, n_splits = 20):
    """Plot row-normalized train and test confusion matrices side by side."""
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    panels = zip(axes,
                 [CM_Train, CM_Test],
                 ['Greens', 'Purples'],
                 ['Train Set (CV = % i)' % n_splits, 'Test Set (CV = % i)' % n_splits])
    for axis, cm, cmap, subtitle in panels:
        # Normalize each row by its total so cells read as per-class rates.
        normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        _ = sns.heatmap(normalized,
                        annot=True, annot_kws={"size": 14}, cmap= cmap, ax = axis,
                        linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
        _ = axis.set_title('Normalized Confusion Matrix' + '\n' + subtitle)
        _ = axis.set_xlabel('Predicted labels')
        _ = axis.set_ylabel('True labels')
        _ = axis.xaxis.set_ticklabels(Labels)
        _ = axis.yaxis.set_ticklabels(Labels)
        _ = axis.set_aspect(1)
def Tables_and_Plots(model):
    """Display the CV performance table, per-set classification reports, and confusion matrices."""
    display(Performance_Table(model).style.hide_index())
    matrices = []
    for title, bg, fg, X_set, y_set in [('Train Set', Back.GREEN, Fore.GREEN, X_train, y_train),
                                        ('Test Set', Back.RED, Fore.RED, X_test, y_test)]:
        Header(title, C1 = bg, C2 = fg)
        cm, report = Classification_Report_CV(model, X=X_set, y=y_set)
        matrices.append(cm)
        display(report)
    Line()
    Confusion_Mat(matrices[0], matrices[1])
def Grid_Table(grid):
    """Summarize a fitted search's cv_results_ as a DataFrame indexed by test-score rank."""
    results = grid.cv_results_
    # Stringify each param dict and strip the braces and quotes for readability.
    strip = str.maketrans('', '', "{}'")
    params = [str(p).translate(strip) for p in results['params']]
    Table = pd.DataFrame({'rank_test_score': results['rank_test_score'],
                          'params': params,
                          'mean_test_score': results['mean_test_score'],
                          'mean_fit_time': results['mean_fit_time']})
    return Table.round(4).sort_values('rank_test_score').set_index('rank_test_score')
def Grid_Performance_Plot(Table):
    """Plot mean test score and mean fit time per parameter combination (from Grid_Table)."""
    font = FontProperties()
    font.set_weight('bold')
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    Z = zip(axes, ['mean_test_score', 'mean_fit_time'], ['Blue', 'Red'],['Classification Accuracy', 'Fit Time (with caching)'])
    for ax, col, c, title in Z:
        # NOTE(review): yerr equals the plotted value itself, so every error bar
        # spans 0..2x the value — looks like a placeholder; confirm intent.
        _ = ax.errorbar(x = Table['params'], y = Table[col], yerr = Table[col], color = c)
        _ = ax.set_xticklabels(labels = Table['params'],rotation=90, fontsize = 10)
        _ = ax.set_ylim(bottom = 0)
        _ = ax.set_xlabel('Paramerers')
        _ = ax.set_title(title, fontproperties=font, fontsize = 14)
def Best_Parm(model, param_dist, Top = None,
              X_train = X_train, y_train= y_train, X_test = X_test, y_test = y_test):
    """Randomized hyper-parameter search with 20-fold CV, scored on precision.

    Displays the best score/parameters, a ranked results table (top `Top`
    rows; every row when Top is None), and the Grid_Performance_Plot; returns
    the fitted RandomizedSearchCV object.
    """
    grid = RandomizedSearchCV(estimator = model, param_distributions = param_dist,
                              cv = KFold(n_splits = 20, shuffle = True),
                              # n_iter far exceeds small grids such as ours;
                              # sklearn then just exhausts the whole grid
                              # (its warning is silenced globally above).
                              n_iter = int(1e3),
                              scoring = 'precision',
                              error_score = 0,
                              verbose = 0,
                              n_jobs = 10,
                              return_train_score = True)
    _ = grid.fit(X_train, y_train)
    display(pd.DataFrame({'Best Score': [grid.best_score_],
                          'Best Paramerers': [str(grid.best_params_)],
                          'Precision': [grid.score(X_test,y_test)]}).round(4).style.hide_index().set_precision(4))
    Table = Grid_Table(grid)
    # FIX: identity comparison with None (PEP 8) instead of `== None`.
    if Top is None:
        Top = Table.shape[0]
    display(Table.reset_index(drop = False).head(Top).style.hide_index().\
            set_precision(4).background_gradient(subset= ['mean_test_score'], cmap='Greens').\
            background_gradient(subset= ['mean_fit_time'], cmap='Oranges'))
    Grid_Performance_Plot(Table)
    return grid
Some of the metrics that we use here to measure the accuracy: \begin{align} \text{Confusion Matrix} = \begin{bmatrix}T_p & F_p\\ F_n & T_n\end{bmatrix}. \end{align}
where $T_p$, $T_n$, $F_p$, and $F_n$ represent true positive, true negative, false positive, and false negative, respectively.
\begin{align} \text{Precision} &= \frac{T_{p}}{T_{p} + F_{p}},\\ \text{Recall} &= \frac{T_{p}}{T_{p} + F_{n}},\\ \text{F1} &= \frac{2 \times \text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}\\ \text{Balanced-Accuracy (bACC)} &= \frac{1}{2}\left( \frac{T_{p}}{T_{p} + F_{n}} + \frac{T_{n}}{T_{n} + F_{p}}\right ) \end{align}The accuracy can be a misleading metric for imbalanced data sets. Here, over 88 percent of the sample has negative (No) and about 12 percent has positive (Yes) values. In these cases, a balanced accuracy (bACC) [6] is recommended that normalizes true positive and true negative predictions by the number of positive and negative samples, respectively, and divides their sum by two.
# Baseline: Gaussian Naive Bayes with sklearn's default parameters.
Header('Gaussian Naive-Bayes with Default Parameters')
GNB = GaussianNB()
print('Default Parameters = %s' % GNB.get_params(deep=True))
_ = GNB.fit(X_train, y_train)
# Cross-validated performance tables, reports, and confusion matrices.
Tables_and_Plots(GNB)
Gaussian Naive-Bayes with Default Parameters ======================================================= Default Parameters = {'priors': None, 'var_smoothing': 1e-09}
| Set | ROC Accuracy | Balanced Accuracy |
|---|---|---|
| Train Set | 0.7498 ± 0.0629 | 0.6970 ± 0.0860 |
| Test Set | 0.7615 ± 0.1555 | 0.6827 ± 0.1010 |
Train Set ==========================================================================================
| precision | recall | f1-score | support | |
|---|---|---|---|---|
| CV = 20 | ||||
| 1 | 0.9069 ± 0.0423 | 0.8367 ± 0.0591 | 0.8691 ± 0.0400 | 43.1500 ± 2.7978 |
| 0 | 0.4036 ± 0.1669 | 0.5509 ± 0.1724 | 0.4479 ± 0.1470 | 8.3000 ± 2.6476 |
| accuracy | 0.7911 ± 0.0589 | 0.7911 ± 0.0589 | 0.7911 ± 0.0589 | 0.7911 ± 0.0589 |
| macro avg | 0.6552 ± 0.0875 | 0.6938 ± 0.0925 | 0.6585 ± 0.0858 | 51.4500 ± 0.4975 |
| weighted avg | 0.8317 ± 0.0560 | 0.7911 ± 0.0589 | 0.8055 ± 0.0538 | 51.4500 ± 0.4975 |
Test Set ===========================================================================================
| precision | recall | f1-score | support | |
|---|---|---|---|---|
| CV = 20 | ||||
| 1 | 0.8982 ± 0.0871 | 0.9134 ± 0.0764 | 0.9027 ± 0.0655 | 18.5000 ± 1.9875 |
| 0 | 0.5048 ± 0.3519 | 0.5254 ± 0.3461 | 0.4653 ± 0.2892 | 3.5500 ± 1.9358 |
| accuracy | 0.8412 ± 0.0991 | 0.8412 ± 0.0991 | 0.8412 ± 0.0991 | 0.8412 ± 0.0991 |
| macro avg | 0.7015 ± 0.1876 | 0.7194 ± 0.1799 | 0.6840 ± 0.1644 | 22.0500 ± 0.2179 |
| weighted avg | 0.8507 ± 0.1203 | 0.8412 ± 0.0991 | 0.8376 ± 0.1084 | 22.0500 ± 0.2179 |
====================================================================================================
In order to find the best parameters for our model, we can use RandomizedSearchCV. Here, we have defined a function Best_Parm to find the best parameters.
# Candidate class priors: fixed (p, 1-p) pairs for p in {0.25, 0.5, 0.75},
# plus the empirical class frequencies observed in y.
Priors = [np.array([x, 1- x]) for x in np.arange(.25, 1, .25)]
# NOTE(review): value_counts() orders by frequency, while GaussianNB expects
# priors in sorted class order — these coincide here only because the majority
# class sorts first; verify if the data changes.
Priors.append((y.value_counts().values/y.count()).round(2))
param_dist = {'priors': Priors, 'var_smoothing': [10**(-x) for x in range(1,11,3)]}
Header('Gaussian Naive-Bayes')
grid = Best_Parm(model = GNB, param_dist = param_dist)
Gaussian Naive-Bayes ===============================================================================
| Best Score | Best Paramerers | Precision |
|---|---|---|
| 0.4433 | {'var_smoothing': 0.1, 'priors': array([0.84, 0.16])} | 0.4607 |
| rank_test_score | params | mean_test_score | mean_fit_time |
|---|---|---|---|
| 1 | var_smoothing: 0.1, priors: array([0.84, 0.16]) | 0.4433 | 0.0022 |
| 2 | var_smoothing: 0.0001, priors: array([0.84, 0.16]) | 0.4014 | 0.0024 |
| 2 | var_smoothing: 1e-07, priors: array([0.84, 0.16]) | 0.4014 | 0.0025 |
| 2 | var_smoothing: 1e-10, priors: array([0.84, 0.16]) | 0.4014 | 0.0023 |
| 5 | var_smoothing: 0.1, priors: array([0.75, 0.25]) | 0.3636 | 0.0024 |
| 6 | var_smoothing: 0.0001, priors: array([0.75, 0.25]) | 0.3412 | 0.0022 |
| 6 | var_smoothing: 1e-07, priors: array([0.75, 0.25]) | 0.3412 | 0.0022 |
| 6 | var_smoothing: 1e-10, priors: array([0.75, 0.25]) | 0.3412 | 0.0021 |
| 9 | var_smoothing: 0.1, priors: array([0.5, 0.5]) | 0.2720 | 0.0022 |
| 10 | var_smoothing: 0.0001, priors: array([0.5, 0.5]) | 0.2648 | 0.0020 |
| 10 | var_smoothing: 1e-07, priors: array([0.5, 0.5]) | 0.2648 | 0.0024 |
| 10 | var_smoothing: 1e-10, priors: array([0.5, 0.5]) | 0.2648 | 0.0022 |
| 13 | var_smoothing: 0.0001, priors: array([0.25, 0.75]) | 0.2208 | 0.0024 |
| 13 | var_smoothing: 1e-07, priors: array([0.25, 0.75]) | 0.2208 | 0.0022 |
| 13 | var_smoothing: 1e-10, priors: array([0.25, 0.75]) | 0.2208 | 0.0020 |
| 16 | var_smoothing: 0.1, priors: array([0.25, 0.75]) | 0.2183 | 0.0027 |
Since we have identified the best parameters for our modeling, we train another model using these parameters.
# Retrain Gaussian Naive Bayes with the best parameters found by the search.
Header('Gaussian Naive-Bayes')
GNB = GaussianNB(**grid.best_params_)
print('Best Parameters = %s' % GNB.get_params(deep=True))
_ = GNB.fit(X_train, y_train)
Tables_and_Plots(GNB)
Gaussian Naive-Bayes =============================================================================== Best Parameters = {'priors': array([0.84, 0.16]), 'var_smoothing': 0.1}
| Set | ROC Accuracy | Balanced Accuracy |
|---|---|---|
| Train Set | 0.7520 ± 0.0619 | 0.6957 ± 0.0821 |
| Test Set | 0.7627 ± 0.1557 | 0.6568 ± 0.1001 |
Train Set ==========================================================================================
| precision | recall | f1-score | support | |
|---|---|---|---|---|
| CV = 20 | ||||
| 1 | 0.9050 ± 0.0501 | 0.8778 ± 0.0519 | 0.8902 ± 0.0416 | 43.1500 ± 2.6509 |
| 0 | 0.4609 ± 0.1361 | 0.5415 ± 0.1630 | 0.4843 ± 0.1319 | 8.3000 ± 2.7946 |
| accuracy | 0.8214 ± 0.0600 | 0.8214 ± 0.0600 | 0.8214 ± 0.0600 | 0.8214 ± 0.0600 |
| macro avg | 0.6830 ± 0.0743 | 0.7096 ± 0.0882 | 0.6873 ± 0.0797 | 51.4500 ± 0.4975 |
| weighted avg | 0.8382 ± 0.0651 | 0.8214 ± 0.0600 | 0.8270 ± 0.0596 | 51.4500 ± 0.4975 |
Test Set ===========================================================================================
| precision | recall | f1-score | support | |
|---|---|---|---|---|
| CV = 20 | ||||
| 1 | 0.8796 ± 0.0968 | 0.9132 ± 0.0719 | 0.8923 ± 0.0676 | 18.5000 ± 2.0857 |
| 0 | 0.4608 ± 0.3390 | 0.4199 ± 0.3453 | 0.3903 ± 0.2741 | 3.5500 ± 2.0365 |
| accuracy | 0.8227 ± 0.1026 | 0.8227 ± 0.1026 | 0.8227 ± 0.1026 | 0.8227 ± 0.1026 |
| macro avg | 0.6702 ± 0.1760 | 0.6666 ± 0.1819 | 0.6413 ± 0.1568 | 22.0500 ± 0.2179 |
| weighted avg | 0.8302 ± 0.1145 | 0.8227 ± 0.1026 | 0.8139 ± 0.1132 | 22.0500 ± 0.2179 |
====================================================================================================
As can be seen, choosing the best parameters didn't significantly improve our performance results.